{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "f608662a-766c-4fce-92f4-d4b2987a7b4b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "605d5e32-e1c5-4641-8d52-5a5b8e8050fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# List of browser User-Agent strings (desktop and mobile browsers).\n",
    "headers = [\n",
    "    \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36\",\n",
    "    \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36\",\n",
    "    \"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0\",\n",
    "    \"Mozilla/5.0 (Linux; Android 8.0.0; SM-G955U Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Mobile Safari/537.36\",\n",
    "    \"Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)\"\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "cf0138d6-2f23-4381-bab2-0c83c3126567",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Address of the listing to scrape, and the request headers sent with it\n",
    "# (the User-Agent is the first entry of `headers`).\n",
    "url = \"https://movie.douban.com/top250\"\n",
    "header = {\"User-Agent\": headers[0]}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "37a29042-b301-456c-b392-91f254f34dc4",
   "metadata": {},
   "outputs": [],
   "source": [
    "def getInfo(url):\n",
    "    \"\"\"Download one listing page and extract 'title,rating' strings.\n",
    "\n",
    "    Request headers come from the notebook-level `header` dict.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    url : str\n",
    "        Address of the listing page to fetch.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list of str\n",
    "        One 'title,rating' string per list item, in page order. Items\n",
    "        lacking a title or a rating element are skipped.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    requests.RequestException\n",
    "        If the request fails, times out, or returns an HTTP error status.\n",
    "    \"\"\"\n",
    "    # A timeout keeps a stalled connection from hanging the kernel forever.\n",
    "    response = requests.get(url, headers=header, timeout=10)\n",
    "    # Fail loudly on 4xx/5xx instead of parsing an error page into an empty list.\n",
    "    response.raise_for_status()\n",
    "    # Name the parser explicitly: plain 'html' lets bs4 pick whichever parser\n",
    "    # happens to be installed, so results could differ between machines.\n",
    "    soup = BeautifulSoup(response.content, 'html.parser')\n",
    "    res = []\n",
    "    for li in soup.select('div#content ol.grid_view>li'):\n",
    "        # select_one returns None (rather than raising) when nothing matches.\n",
    "        title = li.select_one('.title')\n",
    "        rating = li.select_one('.rating_num')\n",
    "        if title is None or rating is None:\n",
    "            continue  # incomplete entry: skip it instead of crashing the run\n",
    "        res.append(title.text + ',' + rating.text)\n",
    "    return res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "9c5d4648-0524-439c-8935-8706771943d4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['肖申克的救赎,9.7', '霸王别姬,9.6', '阿甘正传,9.5', '泰坦尼克号,9.5', '这个杀手不太冷,9.4', '千与千寻,9.4', '美丽人生,9.5', '星际穿越,9.4', '盗梦空间,9.4', '辛德勒的名单,9.5', '楚门的世界,9.4', '忠犬八公的故事,9.4', '海上钢琴师,9.3', '三傻大闹宝莱坞,9.2', '放牛班的春天,9.3', '机器人总动员,9.3', '疯狂动物城,9.2', '无间道,9.3', '控方证人,9.6', '大话西游之大圣娶亲,9.2', '熔炉,9.4', '教父,9.3', '触不可及,9.3', '当幸福来敲门,9.2', '寻梦环游记,9.1']\n"
     ]
    }
   ],
   "source": [
    "# Quick check: scrape `url` and print the extracted entries.\n",
    "print(getInfo(url))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "c7a44e59-c4a6-495f-9e5c-d66fca93760d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the list of page URLs to scrape: start offsets 0, 25, ..., 225.\n",
    "urls = [f\"{url}?start={offset}\" for offset in range(0, 250, 25)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "4ec4e3da-a74f-437a-a87e-7a18d57eba15",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scrape every page in order and pool the entries into one flat list.\n",
    "result = []\n",
    "for page_url in urls:\n",
    "    result += getInfo(page_url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "fc2680e5-bd6f-411d-b356-57b3459f7bfd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "250\n",
      "['肖申克的救赎,9.7', '霸王别姬,9.6', '阿甘正传,9.5', '泰坦尼克号,9.5', '这个杀手不太冷,9.4', '千与千寻,9.4', '美丽人生,9.5', '星际穿越,9.4', '盗梦空间,9.4', '辛德勒的名单,9.5', '楚门的世界,9.4', '忠犬八公的故事,9.4', '海上钢琴师,9.3', '三傻大闹宝莱坞,9.2', '放牛班的春天,9.3', '机器人总动员,9.3', '疯狂动物城,9.2', '无间道,9.3', '控方证人,9.6', '大话西游之大圣娶亲,9.2', '熔炉,9.4', '教父,9.3', '触不可及,9.3', '当幸福来敲门,9.2', '寻梦环游记,9.1', '末代皇帝,9.3', '龙猫,9.2', '怦然心动,9.1', '活着,9.3', '哈利·波特与魔法石,9.2', '蝙蝠侠：黑暗骑士,9.2', '指环王3：王者无敌,9.3', '我不是药神,9.0', '乱世佳人,9.3', '飞屋环游记,9.1', '素媛,9.3', '哈尔的移动城堡,9.1', '十二怒汉,9.4', '何以为家,9.1', '让子弹飞,9.0', '摔跤吧！爸爸,9.0', '猫鼠游戏,9.1', '天空之城,9.2', '鬼子来了,9.3', '少年派的奇幻漂流,9.1', '海蒂和爷爷,9.3', '钢琴家,9.3', '大话西游之月光宝盒,9.0', '指环王2：双塔奇兵,9.2', '闻香识女人,9.1', '死亡诗社,9.2', '绿皮书,8.9', '罗马假日,9.1', '大闹天宫,9.4', '天堂电影院,9.2', '指环王1：护戒使者,9.1', '黑客帝国,9.1', '教父2,9.3', '狮子王,9.1', '辩护人,9.2', '饮食男女,9.2', '搏击俱乐部,9.0', '本杰明·巴顿奇事,9.0', '美丽心灵,9.1', '穿条纹睡衣的男孩,9.2', '窃听风暴,9.2', '情书,8.9', '两杆大烟枪,9.1', '西西里的美丽传说,8.9', '看不见的客人,8.8', '音乐之声,9.1', '阿凡达,8.8', '哈利·波特与死亡圣器(下),9.0', '拯救大兵瑞恩,9.1', '飞越疯人院,9.1', '小鞋子,9.2', '沉默的羔羊,8.9', '布达佩斯大饭店,8.9', '功夫,8.8', '禁闭岛,8.9', '蝴蝶效应,8.9', '致命魔术,8.9', '哈利·波特与阿兹卡班的囚徒,8.9', '心灵捕手,8.9', '超脱,9.0', '低俗小说,8.9', '海豚湾,9.3', '摩登时代,9.3', '春光乍泄,9.0', '美国往事,9.2', '喜剧之王,8.8', '致命ID,8.9', '杀人回忆,8.9', '七宗罪,8.8', '红辣椒,9.1', '加勒比海盗,8.8', '哈利·波特与密室,8.9', '一一,9.1', '狩猎,9.1', '唐伯虎点秋香,8.7', '7号房的礼物,8.9', '被嫌弃的松子的一生,8.9', '蝙蝠侠：黑暗骑士崛起,8.9', '请以你的名字呼唤我,8.8', '爱在黎明破晓前,8.8', '断背山,8.8', '剪刀手爱德华,8.7', '入殓师,8.9', '第六感,8.9', '重庆森林,8.8', '勇敢的心,8.9', '超能陆战队,8.7', '甜蜜蜜,8.9', '幽灵公主,8.9', '爱在日落黄昏时,8.9', '菊次郎的夏天,8.9', '借东西的小人阿莉埃蒂,8.9', '消失的爱人,8.7', '寄生虫,8.8', '阳光灿烂的日子,8.8', '天使爱美丽,8.7', '完美的世界,9.1', '小森林 夏秋篇,9.0', '倩女幽魂,8.8', '无人知晓,9.1', '时空恋旅人,8.8', '侧耳倾听,8.9', '未麻的部屋,9.1', '哈利·波特与火焰杯,8.8', '幸福终点站,8.8', '驯龙高手,8.8', '小森林 冬春篇,9.0', '一个叫欧维的男人决定去死,8.9', '教父3,9.0', '怪兽电力公司,8.8', '玩具总动员3,8.9', '傲慢与偏见,8.7', '萤火之森,8.9', '新世界,8.9', '釜山行,8.6', '被解救的姜戈,8.8', '神偷奶爸,8.7', '茶馆,9.6', '告白,8.8', '玛丽和马克思,9.0', '哪吒闹海,9.2', '大鱼,8.8', '色，戒,8.7', '九品芝麻官,8.7', '喜宴,9.0', '模仿游戏,8.8', '头号玩家,8.7', '射雕英雄传之东成西就,8.7', 
'花样年华,8.8', '我是山姆,9.0', '头脑特工队,8.8', '阳光姐妹淘,8.8', '七武士,9.3', '血战钢锯岭,8.7', '恐怖直播,8.7', '惊魂记,9.0', '黑客帝国3：矩阵革命,8.8', '你的名字。,8.5', '电锯惊魂,8.7', '三块广告牌,8.7', '达拉斯买家俱乐部,8.8', '疯狂原始人,8.7', '心迷宫,8.7', '谍影重重3,8.8', '英雄本色,8.6', '上帝之城,9.0', '风之谷,8.9', '纵横四海,8.8', '卢旺达饭店,8.9', '海街日记,8.8', '爱在午夜降临前,8.9', '绿里奇迹,8.9', '小丑,8.7', '记忆碎片,8.7', '疯狂的石头,8.6', '背靠背，脸对脸,9.5', '雨中曲,9.1', '心灵奇旅,8.7', '2001太空漫游,8.9', '岁月神偷,8.7', '忠犬八公物语,9.2', '无间道2,8.7', '荒蛮故事,8.8', '小偷家族,8.7', '无敌破坏王,8.7', '爆裂鼓手,8.7', '冰川时代,8.6', '恐怖游轮,8.5', '贫民窟的百万富翁,8.6', '牯岭街少年杀人事件,8.9', '东邪西毒,8.6', '魔女宅急便,8.7', '遗愿清单,8.7', '东京教父,9.0', '大佛普拉斯,8.7', '你看起来好像很好吃,8.9', '可可西里,8.9', '真爱至上,8.6', '黑天鹅,8.6', '城市之光,9.3', '源代码,8.5', '海边的曼彻斯特,8.6', '雨人,8.7', '波西米亚狂想曲,8.6', '初恋这件小事,8.5', '恋恋笔记本,8.5', '青蛇,8.6', '人工智能,8.7', '末路狂花,8.9', '虎口脱险,8.9', '终结者2：审判日,8.8', '疯狂的麦克斯4：狂暴之路,8.7', '罗生门,8.8', '新龙门客栈,8.7', '无耻混蛋,8.7', '千钧一发,8.8', '崖上的波妞,8.6', '芙蓉镇,9.3', '萤火虫之墓,8.7', '花束般的恋爱,8.6', '彗星来的那一夜,8.6', '爱乐之城,8.4', '奇迹男孩,8.6', '黑客帝国2：重装上阵,8.7', '二十二,8.7', '哈利·波特与死亡圣器(上),8.5', '血钻,8.7', '战争之王,8.7', '火星救援,8.5', '步履不停,8.8', '房间,8.8', '魂断蓝桥,8.8', '千年女优,8.8', '谍影重重2,8.7', '白日梦想家,8.6', '哈利·波特与凤凰社,8.5', '弱点,8.7', '蜘蛛侠：平行宇宙,8.6', '高山下的花环,9.5', '谍影重重,8.6', '阿飞正传,8.5', '朗读者,8.6', '再次出发之纽约遇见你,8.6', '燃情岁月,8.7', '大红灯笼高高挂,8.8']\n"
     ]
    }
   ],
   "source": [
    "# Report how many entries were collected, then the entries themselves.\n",
    "print(len(result), result, sep='\\n')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
